import pyprind

import pandas as pd

import os

# Root directory of the extracted review dataset; it is expected to contain
# 'test' and 'train' folders, each with 'pos' and 'neg' subfolders.
basepath = 'aclImdb'

# Map the sentiment folder name to its integer class label.
labels = {'pos': 1, 'neg': 0}

# Progress bar sized for 50000 iterations (one update per file read).
pbar = pyprind.ProgBar(50000)

# Collect the rows in a plain list and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it per file copied
# the whole frame on every iteration (quadratic time).
rows = []
for split in ('test', 'train'):
    for label in ('pos', 'neg'):
        path = os.path.join(basepath, split, label)
        # sorted() makes the row order independent of os.listdir ordering.
        for filename in sorted(os.listdir(path)):
            with open(os.path.join(path, filename), 'r',
                      encoding='utf-8') as infile:
                txt = infile.read()
            rows.append([txt, labels[label]])
            pbar.update()

df = pd.DataFrame(rows, columns=['review', 'sentiment'])

# Shuffle the rows and save them to a CSV file

import numpy as np

# Seed NumPy's global RNG so the permutation below is reproducible.
np.random.seed(0)

shuffled_index = np.random.permutation(df.index)
df = df.reindex(shuffled_index)

# index=False: the shuffled index labels are not written to the file.
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

# Using the CSV file

# Load the review data from 'movie_data.csv' into a DataFrame.
df = pd.read_csv('movie_data.csv', encoding='utf-8')

# Data preprocessing

import re

def preprocessor(text):
    """Return a cleaned, lower-cased version of a raw review string.

    Complete HTML tags are removed, every run of non-word characters is
    collapsed to a single space, and any emoticons found in the text
    (such as ":)", ";-(" or "=D") are appended at the end with their
    "-" nose stripped.

    Args:
        text: The raw review text.

    Returns:
        The cleaned text followed by the space-separated emoticons.
    """
    # Remove HTML markup. The closing '>' is part of the pattern so that a
    # whole tag is removed and a lone '<' does not swallow the rest of the
    # text.
    text = re.sub(r'<[^>]*>', '', text)

    # Collect emoticons before punctuation is stripped below.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)

    text = (re.sub(r'[\W]+', ' ', text.lower())
            + ' '.join(emoticons).replace('-', ''))

    return text

# Clean every review in place.
df['review'] = df['review'].apply(preprocessor)

# Split data: the first 25000 rows are for training, the rest for testing.
# Positional iloc slicing is used on purpose: label-based .loc slices include
# the end label, so .loc[:25000] would return 25001 rows and row 25000 would
# appear in both the training and the test set.
n_train = 25000

X_train = df['review'].iloc[:n_train].values
y_train = df['sentiment'].iloc[:n_train].values

X_test = df['review'].iloc[n_train:].values
y_test = df['sentiment'].iloc[n_train:].values

# IMDb text data